In [1]:
#package
import pandas as pd
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statistics import median
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
In [2]:
# Load data
path = 'D:/USA 2021 Spring/Data_analysis/train.csv'
traindf = pd.read_csv(path)
In [3]:
path = 'D:/USA 2021 Spring/Data_analysis/users.csv'
userdf = pd.read_csv(path)
In [4]:
path = 'D:/USA 2021 Spring/Data_analysis/test.csv'
testdf = pd.read_csv(path)

Step 1 Exploration

Data information

In [5]:
traindf.head()
Out[5]:
country_code grass_date user_id subject_line_length last_open_day last_login_day last_checkout_day open_count_last_10_days open_count_last_30_days open_count_last_60_days login_count_last_10_days login_count_last_30_days login_count_last_60_days checkout_count_last_10_days checkout_count_last_30_days checkout_count_last_60_days open_flag row_id
0 4 2019-07-16 00:00:00+08:00 43 44 19 6 18 0 2 4 12 43 99 0 5 10 0 0
1 4 2019-07-16 00:00:00+08:00 102 44 9 4 8 2 9 17 18 48 90 1 1 4 1 1
2 6 2019-07-16 00:00:00+08:00 177 49 14 5 5 0 4 12 24 69 119 5 19 27 0 2
3 1 2019-07-16 00:00:00+08:00 184 49 49 9 53 0 0 1 9 23 69 1 3 6 0 3
4 6 2019-07-16 00:00:00+08:00 221 49 227 6 221 0 0 0 2 5 5 0 0 0 0 4
In [6]:
traindf['grass_date'] = pd.to_datetime(traindf['grass_date']).dt.date

testdf['grass_date'] = pd.to_datetime(testdf['grass_date']).dt.date
In [7]:
traindf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73539 entries, 0 to 73538
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   country_code                 73539 non-null  int64 
 1   grass_date                   73539 non-null  object
 2   user_id                      73539 non-null  int64 
 3   subject_line_length          73539 non-null  int64 
 4   last_open_day                73539 non-null  object
 5   last_login_day               73539 non-null  object
 6   last_checkout_day            73539 non-null  object
 7   open_count_last_10_days      73539 non-null  int64 
 8   open_count_last_30_days      73539 non-null  int64 
 9   open_count_last_60_days      73539 non-null  int64 
 10  login_count_last_10_days     73539 non-null  int64 
 11  login_count_last_30_days     73539 non-null  int64 
 12  login_count_last_60_days     73539 non-null  int64 
 13  checkout_count_last_10_days  73539 non-null  int64 
 14  checkout_count_last_30_days  73539 non-null  int64 
 15  checkout_count_last_60_days  73539 non-null  int64 
 16  open_flag                    73539 non-null  int64 
 17  row_id                       73539 non-null  int64 
dtypes: int64(14), object(4)
memory usage: 10.1+ MB
In [8]:
traindf.describe()
Out[8]:
country_code user_id subject_line_length open_count_last_10_days open_count_last_30_days open_count_last_60_days login_count_last_10_days login_count_last_30_days login_count_last_60_days checkout_count_last_10_days checkout_count_last_30_days checkout_count_last_60_days open_flag row_id
count 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.00000
mean 2.533159 63048.595358 43.656373 0.911829 2.780457 5.019364 8.382287 25.284053 49.010430 0.909898 2.796040 5.420960 0.155781 36769.00000
std 1.675251 36068.006037 11.219323 1.754052 4.570239 7.902313 13.424436 37.853189 71.819327 2.887416 7.686064 13.980182 0.362650 21229.02506
min 1.000000 2.000000 10.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000
25% 1.000000 31679.500000 34.000000 0.000000 0.000000 0.000000 0.000000 1.000000 3.000000 0.000000 0.000000 0.000000 0.000000 18384.50000
50% 2.000000 63340.000000 43.000000 0.000000 1.000000 2.000000 3.000000 11.000000 22.000000 0.000000 0.000000 1.000000 0.000000 36769.00000
75% 3.000000 94731.500000 52.000000 1.000000 3.000000 6.000000 12.000000 35.000000 69.000000 1.000000 3.000000 6.000000 0.000000 55153.50000
max 7.000000 127925.000000 75.000000 31.000000 92.000000 135.000000 254.000000 735.000000 1260.000000 157.000000 531.000000 783.000000 1.000000 73538.00000
In [9]:
for col in traindf.columns:
    if traindf[col].dtype == "object":
        print (traindf[col].unique())
[datetime.date(2019, 7, 16) datetime.date(2019, 7, 17)
 datetime.date(2019, 7, 18) datetime.date(2019, 7, 19)
 datetime.date(2019, 7, 20) datetime.date(2019, 7, 21)
 datetime.date(2019, 7, 22) datetime.date(2019, 7, 23)
 datetime.date(2019, 7, 24) datetime.date(2019, 7, 25)
 datetime.date(2019, 7, 26) datetime.date(2019, 7, 27)
 datetime.date(2019, 7, 28) datetime.date(2019, 7, 29)
 datetime.date(2019, 7, 30) datetime.date(2019, 8, 1)
 datetime.date(2019, 8, 2) datetime.date(2019, 8, 3)
 datetime.date(2019, 8, 4) datetime.date(2019, 8, 5)
 datetime.date(2019, 8, 6) datetime.date(2019, 8, 7)
 datetime.date(2019, 8, 8) datetime.date(2019, 8, 9)
 datetime.date(2019, 8, 10) datetime.date(2019, 8, 11)
 datetime.date(2019, 8, 12) datetime.date(2019, 8, 13)
 datetime.date(2019, 8, 14) datetime.date(2019, 8, 15)
 datetime.date(2019, 8, 16) datetime.date(2019, 8, 17)
 datetime.date(2019, 8, 18) datetime.date(2019, 8, 19)
 datetime.date(2019, 8, 20) datetime.date(2019, 8, 21)
 datetime.date(2019, 8, 22) datetime.date(2019, 8, 23)
 datetime.date(2019, 8, 24) datetime.date(2019, 8, 25)
 datetime.date(2019, 8, 26) datetime.date(2019, 8, 27)
 datetime.date(2019, 8, 28) datetime.date(2019, 8, 29)
 datetime.date(2019, 8, 30) datetime.date(2019, 9, 1)
 datetime.date(2019, 9, 2)]
['19' '9' '14' '49' '227' '5' '13' '163' '7' '130' '27' 'Never open' '4'
 '24' '28' '12' '23' '70' '83' '26' '17' '2' '3' '182' '58' '22' '20' '10'
 '8' '82' '87' '41' '115' '207' '16' '11' '29' '77' '48' '34' '263' '25'
 '30' '56' '21' '180' '91' '90' '47' '64' '61' '401' '78' '86' '32' '76'
 '75' '55' '38' '15' '39' '36' '71' '169' '59' '224' '18' '85' '143' '208'
 '72' '140' '62' '60' '296' '164' '362' '125' '35' '168' '74' '73' '154'
 '210' '46' '33' '53' '69' '37' '93' '221' '233' '347' '42' '116' '54'
 '148' '340' '66' '96' '89' '361' '134' '701' '79' '43' '107' '293' '379'
 '268' '63' '216' '84' '138' '446' '145' '104' '98' '280' '369' '249' '97'
 '103' '170' '57' '40' '301' '552' '246' '267' '81' '297' '51' '466' '540'
 '306' '422' '44' '230' '174' '108' '314' '67' '229' '298' '326' '106'
 '333' '45' '80' '31' '211' '65' '52' '95' '315' '50' '100' '102' '158'
 '228' '256' '226' '105' '454' '277' '368' '191' '244' '137' '181' '383'
 '203' '234' '295' '279' '88' '175' '288' '173' '290' '183' '294' '269'
 '223' '92' '285' '321' '123' '313' '199' '176' '155' '124' '275' '299'
 '165' '271' '112' '253' '160' '239' '272' '141' '153' '139' '68' '189'
 '147' '94' '152' '109' '213' '1' '6' '185' '144' '327' '202' '371' '502'
 '336' '433' '177' '485' '119' '217' '142' '463' '310' '197' '437' '149'
 '464' '128' '132' '135' '198' '231' '248' '259' '316' '184' '264' '114'
 '117' '129' '136' '126' '151' '289' '495' '205' '201' '543' '342' '235'
 '714' '113' '421' '287' '178' '156' '204' '262' '179' '162' '282' '127'
 '254' '252' '266' '146' '200' '225' '118' '196' '187' '121' '101' '193'
 '319' '338' '322' '251' '260' '241' '258' '214' '270' '273' '167' '131'
 '192' '222' '304' '219' '305' '215' '283' '302' '242' '150' '238' '274'
 '206' '159' '186' '261' '286' '549' '445' '358' '303' '247' '325' '257'
 '307' '334' '291' '209' '329' '195' '237' '284' '300' '265' '278' '212'
 '190' '188' '172' '122' '133' '483' '460' '240' '588' '318' '349' '372'
 '520' '498' '684' '424' '245' '419' '456' '480' '345' '255' '335' '545'
 '111' '220' '761' '741' '697' '517' '161' '276' '157' '194' '308' '387'
 '610' '510' '365' '389' '595' '232' '417' '561' '743' '554' '711' '548'
 '481' '386' '250' '626' '351' '439' '110' '337' '309' '415' '354' '166'
 '312' '236' '120' '651' '355' '472' '344' '374' '99' '328' '569' '381'
 '171' '628' '392' '384' '774' '608' '377' '330' '350' '356' '734' '658'
 '655' '243' '292' '413' '425' '746' '564' '395' '798' '783' '643' '400'
 '352' '605' '484' '771' '800' '465' '727' '675' '331' '332' '479' '404'
 '525' '702' '533' '492' '281' '511' '582' '359' '416' '429' '766' '722'
 '423' '414' '428' '399' '324' '451' '430' '488' '516' '526' '555' '366'
 '385' '461' '473' '639' '607' '346' '320' '505' '452' '393' '402' '622'
 '435' '612' '670' '341' '786' '443' '808' '715' '339' '580' '614' '411'
 '431' '660' '441' '360' '571' '641' '506' '323' '311' '375' '317' '218'
 '585' '536' '440' '562' '573' '394' '742' '378' '343' '530' '490' '478'
 '531' '737' '457' '409' '391' '534' '418' '527' '736' '600' '602' '679'
 '705' '420' '621' '804' '450' '625' '408' '519' '631' '493' '703' '458'
 '448' '370' '603' '587' '500' '388' '547' '803' '544' '663' '397' '730'
 '763' '367' '606' '357']
['6' '4' '5' '9' '8' '3' '1' '10' '58' '44' '2' '7' '382' '135' '1396'
 '12' '11' '15' '18' '49' '114' '87' '0' '90' '25' '88' '327' '95' '22'
 '91' '211' '46' '536' '198' '14' '113' '299' '32' '107' '116' '147' '13'
 '65' '36' '19' '251' '51' '26' '21' '901' '324' '349' '115' '592' '216'
 '20' '265' '31' '16' '24' '285' '245' '460' '45' '379' '106' '178' '86'
 '60' '310' '418' '27' '29' '205' '145' '472' '427' '42' '306' '899' '292'
 '220' '267' '222' '834' '101' '43' '164' '475' '206' '805' '85' '38'
 '838' '719' '832' '362' '30' '281' '53' '208' '527' '187' '799' '573'
 '57' '136' '80' '474' '508' '707' '659' '394' '300' '252' '407' '40'
 '383' '117' '632' '83' '77' '679' '200' '693' '662' '503' '356' '17'
 '188' '336' '541' '92' '456' '66' '155' '132' '584' '143' '74' '121'
 '352' '554' '81' '227' '167' '645' '571' '307' '467' '326' '260' '628'
 '37' '140' '622' '94' '62' '215' '607' '347' '23' '105' '56' '163' '610'
 '404' '39' '73' '75' '248' '34' '408' '282' '70' '100' '283' '212' '173'
 '378' '561' '581' '84' '68' '201' '33' '579' '493' '151' '582' '259'
 '102' '577' '274' '567' '396' '269' '562' '517' '28' '502' '559' '191'
 '47' '59' '403' '547' '542' '141' '519' '272' '529' '144' '526' '520'
 '521' '337' '497' '509' '256' '262' '110' '426' '82' '50' '289' '98'
 '196' '499' '428' '236' '130' '313' '491' '103' '63' '468' '484' '193'
 '41' '481' '168' '96' '370' '463' '319' '109' '194' '429' '458' '459'
 '288' '445' '452' '448' '441' '402' '247' '67' '421' '425' '286' '128'
 '446' '325' '377' '61' '417' '596' '406' '175' '226' '400' '392' '387'
 '35' '366' '386' '345' '276' '380' '213' '317' '376' '371' '372' '368'
 '365' '364' '361' '353' '203' '359' '339' '131' '279' '355' '354' '334'
 '186' '348' '268' '241' '290' '333' '52' '64' '122' '224' '209' '161'
 '271' '311' '76' '207' '305' '294' '79' '202' '146' '249' '303' '112'
 '302' '296' '237' '291' '264' '223' '152' '280' '189' '275' '228' '270'
 '263' '55' '261' '133' '258' '183' '231' '255' '72' '253' '137' '184'
 '111' '69' '225' '232' '157' '174' '243' '239' '142' '233' '139' '221'
 '218' '120' '182' '54' '99' '170' '197' '192' 'Never login' '181' '169'
 '89' '159' '158' '154' '48' '153' '125' '123' '148' '104' '129' '127'
 '124' '118' '108' '78' '93' '71' '955' '929' '431' '119' '470' '576'
 '532' '771' '637' '166' '469' '569' '375' '618' '613' '465' '603' '575'
 '609' '321' '190' '548' '323' '479' '477' '312' '442' '438' '316' '430'
 '397' '398' '298' '648' '360' '340' '342' '217' '254' '314' '240' '246'
 '176' '172' '97' '374' '284' '235' '566' '483' '595' '510' '528' '335'
 '511' '506' '126' '308' '390' '341' '346' '293' '330' '185' '138' '369'
 '476' '250' '466' '195' '234' '447' '695' '490' '551' '453' '162' '678'
 '219' '642' '558' '614' '214' '604' '524' '589' '495' '389' '422' '487'
 '478' '320' '473' '464' '395' '451' '343' '436' '363' '230' '149' '385'
 '367' '357' '350' '329' '328' '150' '179' '177' '171' '18096' '301' '439'
 '774' '643' '199' '757' '704' '669' '266' '513' '492' '488' '318' '415'
 '391' '601' '297' '760' '393' '773' '238' '457' '859' '384' '533' '507'
 '413' '416' '437' '435' '165' '273' '399' '244' '331' '381' '242' '534'
 '847' '915' '388' '814' '694' '690' '647' '657' '681' '160' '587' '563'
 '624' '612' '544' '412' '134' '570' '552' '518' '338' '501' '498' '482'
 '480' '440' '433' '315' '277' '180' '210' '373' '358' '309' '278' '156'
 '987' '351' '705' '677' '675' '545' '666' '409' '663' '655' '620' '619'
 '546' '585' '540' '553' '539' '204' '515' '514' '505' '489' '485' '462'
 '432' '287' '295' '810' '698' '861' '322' '687' '635' '617' '578' '537'
 '538' '512' '450' '419' '692' '818' '401' '804' '933' '702' '816' '424'
 '858' '616' '572' '800' '798' '257' '731' '713' '696' '684' '658' '755'
 '615' '525' '594' '574' '410' '434' '423' '344' '229' '18102' '1291'
 '1354' '605' '813' '867' '826' '708' '729' '671' '332' '608' '543' '461'
 '405' '18103' '961' '905' '701' '975' '560' '688' '494' '626' '600' '496'
 '411' '18105' '611' '1135' '738' '557' '770' '550' '672' '667' '644'
 '633' '580' '591' '556' '304' '833' '631' '1001' '960' '912' '752' '767'
 '629' '722' '700' '665' '606' '443' '564' '685' '420' '455' '18107' '875'
 '486' '715' '454' '414' '565' '661' '792' '736' '720' '747' '718' '625'
 '449' '668' '623' '535' '1150' '907' '523' '1305' '1140' '1119' '951'
 '697' '821' '892' '881' '920' '910' '727' '888' '598' '586' '627' '664'
 '651' '593' '630' '568' '549' '471' '1332' '937' '1392' '1357' '776'
 '883' '765' '706' '500' '599' '555' '504' '18114' '822' '682' '602' '634'
 '742' '775' '887' '1049' '873' '904' '890' '896' '703' '812' '744' '699'
 '709' '717' '621' '18116' '1011' '930' '924' '733' '735' '714' '1013'
 '636' '1113' '1052' '1037' '444' '710' '786' '639' '689' '1028' '1148'
 '516' '894' '854' '753' '18122' '877' '1100' '1445' '976' '716' '884'
 '866' '845' '743' '724' '725' '646' '652' '640' '597' '18124' '902' '531'
 '870' '801' '650' '754' '1097' '730' '530' '1231' '950' '656' '793' '781'
 '653' '673' '1152' '943' '944' '921' '807' '660' '638' '641' '588' '783'
 '766' '670' '927' '1051' '654' '18129' '923' '790' '852' '815' '680'
 '649' '1002' '778' '779' '583' '18131' '522' '874' '759' '18133' '18134'
 '828' '777' '723' '674' '1288' '898' '871' '851' '1434' '1262' '876'
 '1091' '948' '839' '853' '745' '1437' '1117' '817' '1251' '18140' '889'
 '1380' '18141' '820' '734' '683' '1083']
['18' '8' '5' '53' '221' '70' '3' '28' '26' '12' '156' '2' '7' '86' '57'
 '23' '9' '6' '49' '16' '468' '1' 'Never checkout' '139' '128' '40' '22'
 '13' '4' '21' '60' '15' '420' '54' '50' '123' '855' '17' '95' '29' '25'
 '10' '172' '499' '11' '190' '111' '214' '186' '58' '319' '76' '1244' '20'
 '175' '14' '63' '38' '1122' '75' '83' '64' '1167' '48' '34' '290' '433'
 '168' '101' '35' '148' '39' '84' '69' '30' '32' '37' '205' '166' '143'
 '230' '19' '1118' '124' '224' '622' '676' '939' '246' '78' '46' '118'
 '61' '62' '1024' '79' '326' '415' '36' '659' '112' '107' '47' '272' '434'
 '31' '41' '142' '243' '27' '283' '45' '413' '105' '151' '316' '379' '202'
 '43' '451' '33' '68' '533' '232' '87' '91' '147' '572' '917' '144' '104'
 '59' '403' '67' '187' '306' '700' '184' '106' '66' '551' '456' '358' '55'
 '235' '452' '85' '719' '442' '51' '110' '355' '73' '310' '215' '527'
 '328' '302' '194' '259' '237' '597' '493' '71' '137' '188' '479' '743'
 '727' '746' '425' '311' '81' '474' '470' '158' '129' '627' '119' '343'
 '614' '219' '125' '650' '133' '303' '120' '692' '108' '24' '431' '94'
 '271' '321' '356' '341' '122' '675' '600' '204' '308' '667' '274' '162'
 '222' '42' '161' '654' '655' '440' '100' '97' '647' '211' '225' '645'
 '478' '173' '65' '349' '387' '134' '89' '216' '80' '609' '409' '611'
 '406' '333' '249' '570' '56' '77' '594' '581' '185' '229' '213' '247'
 '584' '167' '126' '279' '195' '574' '535' '523' '217' '548' '502' '155'
 '505' '322' '278' '394' '529' '115' '534' '169' '109' '152' '344' '514'
 '269' '189' '176' '150' '200' '501' '500' '404' '96' '481' '174' '471'
 '93' '407' '132' '288' '154' '312' '462' '131' '459' '291' '383' '146'
 '449' '121' '336' '416' '233' '72' '427' '242' '350' '275' '418' '294'
 '257' '159' '99' '90' '410' '183' '258' '389' '385' '74' '374' '378'
 '372' '331' '364' '365' '113' '197' '261' '354' '212' '52' '345' '264'
 '330' '231' '324' '299' '182' '320' '191' '250' '218' '177' '309' '268'
 '255' '289' '277' '116' '227' '263' '209' '203' '164' '180' '245' '160'
 '82' '223' '192' '102' '206' '44' '127' '140' '98' '103' '566' '1260'
 '1132' '347' '1274' '426' '170' '635' '582' '802' '863' '948' '210'
 '1000' '136' '236' '267' '138' '117' '305' '88' '179' '586' '705' '130'
 '681' '135' '270' '473' '145' '642' '342' '178' '618' '377' '605' '487'
 '114' '244' '329' '547' '92' '359' '503' '357' '332' '256' '297' '199'
 '398' '397' '395' '390' '360' '393' '157' '171' '325' '318' '234' '284'
 '207' '286' '262' '248' '1169' '254' '516' '1026' '994' '504' '817' '811'
 '141' '749' '208' '686' '165' '578' '688' '239' '610' '517' '496' '421'
 '464' '386' '461' '399' '380' '414' '352' '400' '362' '196' '300' '273'
 '193' '201' '149' '1039' '1162' '435' '730' '466' '228' '644' '388' '737'
 '831' '453' '733' '491' '327' '335' '602' '731' '710' '543' '696' '658'
 '666' '641' '488' '313' '429' '454' '573' '486' '576' '281' '437' '315'
 '198' '241' '484' '408' '373' '367' '363' '460' '467' '220' '375' '436'
 '240' '282' '339' '396' '266' '317' '340' '337' '285' '226' '531' '251'
 '369' '980' '885' '643' '401' '422' '706' '580' '678' '638' '603' '616'
 '443' '608' '558' '560' '508' '368' '238' '298' '850' '323' '412' '893'
 '252' '265' '334' '392' '153' '348' '465' '536' '482' '382' '253' '402'
 '346' '260' '1138' '163' '595' '869' '625' '623' '803' '448' '301' '485'
 '490' '361' '636' '556' '565' '511' '540' '515' '509' '445' '480' '457'
 '314' '438' '432' '381' '366' '338' '537' '510' '181' '742' '351' '430'
 '424' '801' '691' '411' '599' '455' '717' '649' '469' '307' '683' '439'
 '672' '670' '663' '657' '526' '624' '613' '596' '604' '592' '296' '384'
 '295' '541' '506' '371' '475' '441' '376' '293' '287' '865' '756' '698'
 '631' '1047' '782' '653' '498' '549' '538' '292' '542' '1124' '835' '775'
 '878' '701' '931' '552' '930' '883' '419' '569' '769' '697' '507' '450'
 '621' '598' '575' '545' '353' '513' '446' '497' '998' '477' '819' '652'
 '805' '601' '304' '519' '921' '571' '391' '405' '1158' '738' '521' '927'
 '777' '577' '612' '755' '495' '646' '664' '626' '530' '588' '423' '550'
 '567' '444' '476' '370' '280' '1209' '703' '984' '1070' '587' '684' '754'
 '815' '753' '593' '606' '554' '585' '532' '539' '483' '463' '615' '1392'
 '276' '986' '1130' '792' '1003' '800' '771' '637' '953' '690' '966' '806'
 '779' '773' '568' '709' '656' '557' '687' '544' '1281' '1252' '1046'
 '757' '564' '807' '739' '634' '447' '472' '752' '794' '721' '562' '489'
 '1147' '1096' '778' '693' '763' '1016' '791' '589' '797' '783' '492'
 '559' '894' '910' '1140' '1100' '916' '639' '525' '990' '864' '673' '857'
 '845' '920' '766' '740' '843' '780' '824' '784' '776' '633' '722' '725'
 '630' '512' '1179' '1393' '1193' '1082' '923' '890' '736' '553' '528'
 '760' '715' '617' '590' '1103' '723' '804' '702' '583' '494' '1445'
 '1430' '1127' '1128' '1037' '1107' '874' '591' '876' '987' '892' '888'
 '854' '868' '834' '851' '751' '786' '720' '711' '674' '620' '661' '640'
 '563' '561' '915' '1063' '761' '718' '1336' '1230' '1079' '1105' '983'
 '787' '853' '750' '765' '648' '1199' '682' '1066' '607' '555' '1206'
 '820' '522' '417' '518' '1203' '669' '1033' '781' '989' '825' '619' '785'
 '741' '651' '821' '428' '708' '660' '677' '1391' '1040' '891' '861' '668'
 '629' '632' '1420' '908' '1168' '832' '679' '943' '1153' '944' '1030'
 '1056' '974' '685' '1159' '1337' '716' '704' '1114' '1275' '1324' '1270'
 '1110' '947' '524' '1042' '1304' '1049' '1116' '897' '867' '818' '712'
 '1076' '458' '988' '965' '822' '699' '1186' '799' '520' '1010' '848'
 '907' '814' '1210' '764' '887' '837' '836' '1377' '714' '732' '546'
 '1028' '1091' '1350' '1313' '1301' '1072' '724' '1022' '942' '852' '905'
 '758' '1085' '901' '1012' '996' '982' '1380' '1201' '1283' '1027' '748'
 '1417' '1087']
In [10]:
traindf.isna().sum()
Out[10]:
country_code                   0
grass_date                     0
user_id                        0
subject_line_length            0
last_open_day                  0
last_login_day                 0
last_checkout_day              0
open_count_last_10_days        0
open_count_last_30_days        0
open_count_last_60_days        0
login_count_last_10_days       0
login_count_last_30_days       0
login_count_last_60_days       0
checkout_count_last_10_days    0
checkout_count_last_30_days    0
checkout_count_last_60_days    0
open_flag                      0
row_id                         0
dtype: int64
In [11]:
msno.matrix(traindf) 
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ba02d1a048>
In [12]:
userdf.head()
Out[12]:
user_id attr_1 attr_2 attr_3 age domain
0 0 NaN 1.0 0.0 NaN @gmail.com
1 1 1.0 1.0 2.0 50.0 @gmail.com
2 2 NaN 1.0 0.0 NaN other
3 3 NaN 1.0 0.0 NaN @gmail.com
4 4 1.0 1.0 2.0 33.0 @gmail.com
In [13]:
userdf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127886 entries, 0 to 127885
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user_id  127886 non-null  int64  
 1   attr_1   78987 non-null   float64
 2   attr_2   127439 non-null  float64
 3   attr_3   127886 non-null  float64
 4   age      78987 non-null   float64
 5   domain   127886 non-null  object 
dtypes: float64(4), int64(1), object(1)
memory usage: 5.9+ MB
In [14]:
userdf.describe()
Out[14]:
user_id attr_1 attr_2 attr_3 age
count 127886.000000 78987.000000 127439.000000 127886.000000 78987.000000
mean 63961.168760 0.999823 0.998007 1.079782 31.370124
std 36928.842572 0.013312 0.044600 1.035413 11.591752
min 0.000000 0.000000 0.000000 0.000000 -17.000000
25% 31981.250000 1.000000 1.000000 0.000000 24.000000
50% 63958.500000 1.000000 1.000000 1.000000 29.000000
75% 95941.750000 1.000000 1.000000 2.000000 36.000000
max 127925.000000 1.000000 1.000000 4.000000 118.000000
In [15]:
userdf.isna().sum()
Out[15]:
user_id        0
attr_1     48899
attr_2       447
attr_3         0
age        48899
domain         0
dtype: int64
In [16]:
msno.matrix(userdf) 
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ba025449c8>
In [17]:
userdf['attr_1'].fillna(2.0, inplace = True)
userdf['attr_2'].fillna(userdf['attr_2'].value_counts().index[0], inplace=True)
In [18]:
import statistics 
median = round(userdf['age'].median())
std = userdf['age'].std()
outliers = (userdf['age']-median).abs()>std
userdf['age'][outliers]=np.nan
userdf['age'].fillna(median, inplace=True)
C:\Users\USER\anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
In [19]:
msno.matrix(userdf) 
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ba0379e5c8>

Target variables

In [20]:
#imbalance
fig = px.histogram(traindf, x="open_flag")
fig.show()
In [21]:
traindf['open_flag'].value_counts(normalize = True)
Out[21]:
0    0.844219
1    0.155781
Name: open_flag, dtype: float64

Input variables

In [22]:
plt.figure(figsize = (10, 10))
traindf['grass_date'].hist()
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ba072d8ec8>
In [23]:
#scalization. normalization
col_ele = traindf.columns.tolist()
for remove_ele in ["country_code",'grass_date',"user_id","last_open_day","last_login_day","last_checkout_day","row_id","open_flag"]:
    col_ele.remove(remove_ele)
    
fig = make_subplots(rows=3, cols=4, x_title='value',
                    y_title='count',subplot_titles=(col_ele))

for i, name in enumerate(col_ele):
    trace = go.Histogram(x=traindf[name], marker = {'color': "gray"})
    fig.append_trace(trace, i//4+1, i%4+1)

fig.update_layout(showlegend=False)
fig.show()
In [24]:
fig = px.box(traindf, y = col_ele)
fig.show()
In [25]:
lbls, freqs = np.unique(traindf['country_code'].values, return_counts = True)
#print(list(zip(lbls, freqs)))

plt.figure(figsize = (10, 10))
plt.title('Train - Country Code')
plt.pie(freqs, labels = lbls, autopct = '%1.1f%%', shadow = False, startangle = 90)
plt.show()
In [26]:
plt.figure(figsize = (10, 10))
sns.distplot(userdf['age'])
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ba0745e308>

Data Clean

In [27]:
traindf = pd.merge(traindf, userdf, on = 'user_id')
testdf = pd.merge(testdf, userdf, on = 'user_id')
display(traindf.head(), testdf.head())
country_code grass_date user_id subject_line_length last_open_day last_login_day last_checkout_day open_count_last_10_days open_count_last_30_days open_count_last_60_days ... checkout_count_last_10_days checkout_count_last_30_days checkout_count_last_60_days open_flag row_id attr_1 attr_2 attr_3 age domain
0 4 2019-07-16 43 44 19 6 18 0 2 4 ... 0 5 10 0 0 1.0 1.0 2.0 29.0 @gmail.com
1 4 2019-07-16 102 44 9 4 8 2 9 17 ... 1 1 4 1 1 1.0 1.0 2.0 25.0 @hotmail.com
2 6 2019-07-16 177 49 14 5 5 0 4 12 ... 5 19 27 0 2 2.0 1.0 0.0 29.0 @yahoo.com
3 1 2019-07-16 184 49 49 9 53 0 0 1 ... 1 3 6 0 3 1.0 1.0 2.0 24.0 @yahoo.com
4 6 2019-07-16 221 49 227 6 221 0 0 0 ... 0 0 0 0 4 2.0 1.0 0.0 29.0 @hotmail.com

5 rows × 23 columns

country_code grass_date user_id subject_line_length last_open_day last_login_day last_checkout_day open_count_last_10_days open_count_last_30_days open_count_last_60_days ... login_count_last_60_days checkout_count_last_10_days checkout_count_last_30_days checkout_count_last_60_days row_id attr_1 attr_2 attr_3 age domain
0 6 2019-09-03 0 35 27 2 13 2 3 4 ... 134 0 6 18 0 2.0 1.0 0.0 29.0 @gmail.com
1 6 2019-09-03 130 35 7 5 383 1 1 1 ... 5 0 0 0 1 2.0 1.0 0.0 29.0 @gmail.com
2 5 2019-09-03 150 25 34 1 3 0 0 0 ... 38 2 2 2 2 1.0 1.0 2.0 33.0 @gmail.com
3 1 2019-09-03 181 36 63 5 5 0 0 0 ... 173 2 5 5 3 1.0 1.0 2.0 22.0 @yahoo.com
4 5 2019-09-03 192 23 6 5 54 0 0 0 ... 39 0 0 2 4 2.0 1.0 0.0 29.0 @gmail.com

5 rows × 22 columns

In [28]:
#date
traindf['year'] = pd.to_datetime(traindf['grass_date']).dt.year
traindf['month'] = pd.to_datetime(traindf['grass_date']).dt.month
traindf['day'] = pd.to_datetime(traindf['grass_date']).dt.day

testdf['year'] = pd.to_datetime(testdf['grass_date']).dt.year
testdf['month'] = pd.to_datetime(testdf['grass_date']).dt.month
testdf['day'] = pd.to_datetime(testdf['grass_date']).dt.day
In [29]:
del traindf['grass_date'], testdf['grass_date'], traindf['user_id'], testdf['user_id'], traindf['row_id'], testdf['row_id']
In [30]:
#as type
traindf['last_open_day'] = traindf['last_open_day'].replace('Never open', 0)
traindf['last_login_day'] = traindf['last_login_day'].replace('Never login', 0)
traindf['last_checkout_day'] = traindf['last_checkout_day'].replace('Never checkout', 0)

testdf['last_open_day'] = testdf['last_open_day'].replace('Never open', 0)
testdf['last_login_day'] = testdf['last_login_day'].replace('Never login', 0)
testdf['last_checkout_day'] = testdf['last_checkout_day'].replace('Never checkout', 0)
In [31]:
for col in traindf.columns:
    if traindf[col].dtype == 'object'and col != 'domain':
        traindf[col] = traindf[col].astype(np.int32)
In [32]:
for col in testdf.columns:
    if testdf[col].dtype == 'object' and col != 'domain':
        testdf[col] = testdf[col].astype(np.int32)
In [33]:
fig = make_subplots(rows=1, cols=3, x_title='value',
                    y_title='count',subplot_titles=(col_ele))

for i, name in enumerate(["last_open_day","last_login_day","last_checkout_day"]):
    trace = go.Histogram(x=traindf[name], marker = {'color': "gray"})
    fig.append_trace(trace, i//3+1, i%3+1)

fig.update_layout(showlegend=False)
fig.show()
In [34]:
traindf.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 73539 entries, 0 to 73538
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   country_code                 73539 non-null  int64  
 1   subject_line_length          73539 non-null  int64  
 2   last_open_day                73539 non-null  int32  
 3   last_login_day               73539 non-null  int32  
 4   last_checkout_day            73539 non-null  int32  
 5   open_count_last_10_days      73539 non-null  int64  
 6   open_count_last_30_days      73539 non-null  int64  
 7   open_count_last_60_days      73539 non-null  int64  
 8   login_count_last_10_days     73539 non-null  int64  
 9   login_count_last_30_days     73539 non-null  int64  
 10  login_count_last_60_days     73539 non-null  int64  
 11  checkout_count_last_10_days  73539 non-null  int64  
 12  checkout_count_last_30_days  73539 non-null  int64  
 13  checkout_count_last_60_days  73539 non-null  int64  
 14  open_flag                    73539 non-null  int64  
 15  attr_1                       73539 non-null  float64
 16  attr_2                       73539 non-null  float64
 17  attr_3                       73539 non-null  float64
 18  age                          73539 non-null  float64
 19  domain                       73539 non-null  object 
 20  year                         73539 non-null  int64  
 21  month                        73539 non-null  int64  
 22  day                          73539 non-null  int64  
dtypes: float64(4), int32(3), int64(15), object(1)
memory usage: 12.6+ MB
In [35]:
testdf.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 55970 entries, 0 to 55969
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   country_code                 55970 non-null  int64  
 1   subject_line_length          55970 non-null  int64  
 2   last_open_day                55970 non-null  int32  
 3   last_login_day               55970 non-null  int32  
 4   last_checkout_day            55970 non-null  int32  
 5   open_count_last_10_days      55970 non-null  int64  
 6   open_count_last_30_days      55970 non-null  int64  
 7   open_count_last_60_days      55970 non-null  int64  
 8   login_count_last_10_days     55970 non-null  int64  
 9   login_count_last_30_days     55970 non-null  int64  
 10  login_count_last_60_days     55970 non-null  int64  
 11  checkout_count_last_10_days  55970 non-null  int64  
 12  checkout_count_last_30_days  55970 non-null  int64  
 13  checkout_count_last_60_days  55970 non-null  int64  
 14  attr_1                       55970 non-null  float64
 15  attr_2                       55970 non-null  float64
 16  attr_3                       55970 non-null  float64
 17  age                          55970 non-null  float64
 18  domain                       55970 non-null  object 
 19  year                         55970 non-null  int64  
 20  month                        55970 non-null  int64  
 21  day                          55970 non-null  int64  
dtypes: float64(4), int32(3), int64(14), object(1)
memory usage: 9.2+ MB
In [36]:
target = traindf['open_flag'].copy()
del traindf['open_flag']
In [37]:
cat_features = ['country_code', 'domain', 'year', 'month', 'day', 'attr_1', 'attr_2', 'attr_3']
num_features = [col for col in traindf.columns if col not in cat_features]
print(cat_features, num_features)
['country_code', 'domain', 'year', 'month', 'day', 'attr_1', 'attr_2', 'attr_3'] ['subject_line_length', 'last_open_day', 'last_login_day', 'last_checkout_day', 'open_count_last_10_days', 'open_count_last_30_days', 'open_count_last_60_days', 'login_count_last_10_days', 'login_count_last_30_days', 'login_count_last_60_days', 'checkout_count_last_10_days', 'checkout_count_last_30_days', 'checkout_count_last_60_days', 'age']
In [38]:
plt.figure(figsize=(14,10))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(traindf[num_features].corr(), cmap = "BrBG",
linewidths=0.1, square=True, linecolor='white',
annot=True)
plt.show()

Step 2 Modeling

In [39]:
from sklearn.model_selection import train_test_split
# Train_Test Split
Xtrain, Xvalid, ytrain, yvalid = train_test_split(traindf,target,test_size=0.30)
In [40]:
 ytrain
Out[40]:
35816    0
104      0
57270    0
36162    0
2128     0
        ..
1181     0
49015    1
48898    0
56742    0
16679    0
Name: open_flag, Length: 51477, dtype: int64
In [41]:
#preprocessing
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder, MinMaxScaler
preprocessor = make_column_transformer(
    (OneHotEncoder(), cat_features),
    (MinMaxScaler(), num_features),
    remainder='passthrough'
)
#preprocessor1 = make_column_transformer(
#    (OneHotEncoder(), cat_features),
#    (StandardScaler(), num_features),
#    remainder='passthrough'
#)
In [42]:
#!pip install imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn import svm

models = []

#linear model
models.append(("simple_logistic", 
               make_pipeline(preprocessor,
                             SMOTE(sampling_strategy=0.4),
                             RandomUnderSampler(sampling_strategy=0.5),
                             LogisticRegression())))
models.append(("simple_logistic_elaticnet", 
               make_pipeline(preprocessor,
                             SMOTE(sampling_strategy=0.4),
                             RandomUnderSampler(sampling_strategy=0.5),
                             SGDClassifier(loss='log', penalty='elasticnet', alpha=0.0001, l1_ratio=0.15))))
models.append(("naivebayes", 
               make_pipeline(preprocessor,
                             SMOTE(sampling_strategy=0.4),
                             RandomUnderSampler(sampling_strategy=0.5),
                             BernoulliNB())))
#non-linear model
models.append(("random_forest", 
               make_pipeline(preprocessor,
                             SMOTE(sampling_strategy=0.4),
                             RandomUnderSampler(sampling_strategy=0.5),
                             RandomForestClassifier(max_depth=2, random_state=0))))

models.append(("xgboost", 
               make_pipeline(preprocessor,
                             SMOTE(sampling_strategy=0.4),
                             RandomUnderSampler(sampling_strategy=0.5),
                             xgb.XGBClassifier())))
In [43]:
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.model_selection import RepeatedKFold, cross_validate
from sklearn.metrics import matthews_corrcoef
#k-fold
my_ctrl = RepeatedKFold(n_splits=5, n_repeats=2)
#performance matrix
mcc = make_scorer(matthews_corrcoef)
my_metric =(mcc)
In [44]:
from numpy import absolute, mean, std
model_results = []
names = []
for name, model in models:
    cv_results = cross_validate(model, Xtrain, ytrain, scoring=my_metric, cv=my_ctrl, n_jobs=-1)
    model_results.append(cv_results["test_score"])
    names.append(name)
    msg = "%s: %f (%f)" % (name, mean(absolute(model_results)), std(absolute(model_results)))
    print(msg)
simple_logistic: 0.511520 (0.012899)
simple_logistic_elaticnet: 0.509366 (0.013280)
naivebayes: 0.451754 (0.082507)
random_forest: 0.458537 (0.072717)
xgboost: 0.469920 (0.069044)
In [45]:
Mcc = pd.DataFrame(absolute(model_results),index=names).T
In [46]:
import plotly.graph_objects as go
fig = go.Figure()
for name in names:
    fig.add_trace(go.Box(x=Mcc[name],name=name))
fig.update_layout(title_text="Mcc")
fig.show()
In [47]:
models[4][1].fit(Xtrain, ytrain)
Out[47]:
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['country_code', 'domain',
                                                   'year', 'month', 'day',
                                                   'attr_1', 'attr_2',
                                                   'attr_3']),
                                                 ('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['subject_line_length',
                                                   'last_open_day',
                                                   'last_login_day',
                                                   'last_checkout_day',
                                                   'open_count_last_10_days',
                                                   'open_count_last...s',
                                                   'open_count_last_60_days',
                                                   'login_count_last_10_days',
                                                   'login_count_last_30_days',
                                                   'login_count_last_60_days',
                                                   'checkout_count_last_10_days',
                                                   'checkout_count_last_30_days',
                                                   'checkout_count_last_60_days',
                                                   'age'])])),
                ('smote', SMOTE(sampling_strategy=0.4)),
                ('randomundersampler',
                 RandomUnderSampler(sampling_strategy=0.5)),
                ('xgbclassifier', XGBClassifier())])
In [48]:
models[4][1].predict(Xvalid)
Out[48]:
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
In [49]:
matthews_corrcoef(yvalid,  models[4][1].predict(Xvalid))
Out[49]:
0.5167831947390359

submit

In [50]:
testdf.head()
Out[50]:
country_code subject_line_length last_open_day last_login_day last_checkout_day open_count_last_10_days open_count_last_30_days open_count_last_60_days login_count_last_10_days login_count_last_30_days ... checkout_count_last_30_days checkout_count_last_60_days attr_1 attr_2 attr_3 age domain year month day
0 6 35 27 2 13 2 3 4 10 34 ... 6 18 2.0 1.0 0.0 29.0 @gmail.com 2019 9 3
1 6 35 7 5 383 1 1 1 5 5 ... 0 0 2.0 1.0 0.0 29.0 @gmail.com 2019 9 3
2 5 25 34 1 3 0 0 0 13 19 ... 2 2 1.0 1.0 2.0 33.0 @gmail.com 2019 9 3
3 1 36 63 5 5 0 0 0 43 110 ... 5 5 1.0 1.0 2.0 22.0 @yahoo.com 2019 9 3
4 5 23 6 5 54 0 0 0 4 12 ... 0 2 2.0 1.0 0.0 29.0 @gmail.com 2019 9 3

5 rows × 22 columns

In [51]:
predictions = models[4][1].predict(testdf)
In [52]:
path = 'D:/USA 2021 Spring/Data_analysis/sample_submission_0_1.csv'
sampledf = pd.read_csv(path)
sampledf
Out[52]:
row_id open_flag
0 0 1
1 1 1
2 2 1
3 3 1
4 4 1
... ... ...
55965 55965 0
55966 55966 0
55967 55967 0
55968 55968 0
55969 55969 0

55970 rows × 2 columns

In [53]:
sampledf['open_flag'] = predictions
sampledf.to_csv('D:/USA 2021 Spring/Data_analysis/sample_sub_ShopeeEmail.csv', index = False)
sampledf
Out[53]:
row_id open_flag
0 0 0
1 1 0
2 2 0
3 3 0
4 4 0
... ... ...
55965 55965 0
55966 55966 0
55967 55967 0
55968 55968 0
55969 55969 1

55970 rows × 2 columns

In [54]:
fig = px.histogram(sampledf, x="open_flag")
fig.show()
In [55]:
sampledf['open_flag'].value_counts(normalize = True)
Out[55]:
0    0.868966
1    0.131034
Name: open_flag, dtype: float64